# Section 1 - Introduction: Introduce your general and specific research question(s). What is the general problem area that this analysis contributes to? What specific problem are you trying to solve? Why is this important? Why is this hard? What does theory / prior work tell us about this problem and how are you extending it? What is your approach?

# Section 2 - Data overview: Describe your data at a high level, answering questions such as what are the entities in the data, how many entities are there, and what are the features or feature families relevant to the problem you’re tackling?

library(tidyverse)
library(tidymodels)
library(tidytext)
library(textrecipes)
library(here)
library(moderndive)
# Plot styling ----
# Make ggplot2's minimal theme the default for every plot drawn afterwards.
theme_set(theme_minimal())

# Hex colour constants available to later plotting code.
ukred <- "#D00C27"
scotblue <- "#0065BF"

# Load data ----
# Read the CSV from a path relative to the working directory, without
# printing the guessed column specification.
data <- "Data/Data.csv" %>%
  read_csv(show_col_types = FALSE)
New names:
• `` -> `...1`
# Per-conference subsets ----
# Split the full data set on the `conference` column so that each venue can be
# analysed on its own. Each subset only needs to be built once.
data_acl <- data %>% filter(conference == "acl_2017")
data_conll <- data %>% filter(conference == "conll_2016")
# Pooled data: stratified train/test split ----
# Fix the RNG seed so the split is reproducible; rows are stratified on the
# outcome so both partitions see a similar RECOMMENDATION distribution.
set.seed(1234)
data_splot <- initial_split(data, strata = RECOMMENDATION)
train <- training(data_splot)
test  <- testing(data_splot)

# Fit the linear model on the pooled training set. confint() below needs a
# fitted model object, so the fit has to happen before it is inspected.
RECOMMENDATION_model <- lm(
  RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS +
    MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS + ORIGINALITY +
    CLARITY + REVIEWER_CONFIDENCE,
  data = train
)

# 95% confidence intervals (confint()'s default level) for each coefficient.
confint(RECOMMENDATION_model)
                             2.5 %     97.5 %
(Intercept)           -1.972593146 0.44889629
IMPACT                -0.149811797 0.21050962
SUBSTANCE              0.396858776 0.63581070
APPROPRIATENESS       -0.039465619 0.40327408
MEANINGFUL_COMPARISON  0.001246276 0.22906417
SOUNDNESS_CORRECTNESS -0.091965316 0.13919052
ORIGINALITY           -0.098122069 0.14034219
CLARITY                0.115166033 0.32227939
REVIEWER_CONFIDENCE   -0.163537596 0.06747741
# Residual diagnostics for the current RECOMMENDATION_model ----
# get_regression_points() returns one row per observation with the fitted
# value (`RECOMMENDATION_hat`) and the `residual`.
model_points <- get_regression_points(RECOMMENDATION_model)

# Histogram of residuals. The plot must read from `model_points` (the object
# created above) and map the `residual` column so it matches its axis label.
ggplot(model_points, aes(x = residual)) +
  geom_histogram(bins = 20) +
  labs(x = "Residual", y = "Count")

# Residuals against SUBSTANCE; the y axis shows residuals, so label it as such.
ggplot(model_points, aes(x = SUBSTANCE, y = residual)) +
  geom_point() +
  labs(x = "SUBSTANCE", y = "Residual")

# ACL 2017: stratified train/test split ----
# The seed is reset before each split so each one is reproducible on its own.
set.seed(1234)
data_split <- data_acl %>% initial_split(strata = RECOMMENDATION)
train_acl <- data_split %>% training()
test_acl <- data_split %>% testing()

# CoNLL 2016: stratified train/test split ----
# NOTE: `data_split` is reused, so after this point it holds the CoNLL split.
set.seed(1234)
data_split <- data_conll %>% initial_split(strata = RECOMMENDATION)
train_conll <- data_split %>% training()
test_conll <- data_split %>% testing()
# Linear model on the ACL training set ----
# Regress RECOMMENDATION on the eight review-score columns. This overwrites
# any previously fitted `RECOMMENDATION_model`.
RECOMMENDATION_model <- lm(
  RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS +
    MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS + ORIGINALITY +
    CLARITY + REVIEWER_CONFIDENCE,
  data = train_acl
)

# Print the call and the fitted coefficients.
RECOMMENDATION_model

Call:
lm(formula = RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS + 
    MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS + ORIGINALITY + 
    CLARITY + REVIEWER_CONFIDENCE, data = train_acl)

Coefficients:
          (Intercept)                 IMPACT              SUBSTANCE  
             -0.83341               -0.01740                0.55182  
      APPROPRIATENESS  MEANINGFUL_COMPARISON  SOUNDNESS_CORRECTNESS  
              0.13264                0.12511               -0.03872  
          ORIGINALITY                CLARITY    REVIEWER_CONFIDENCE  
              0.09463                0.19268                0.05205  
# Residual histogram for the ACL model ----
# Compute the per-observation fitted values and residuals once, and keep both
# historical names pointing at the same table.
model_points <- get_regression_points(RECOMMENDATION_model)
RECOMMENDATION_model_points <- model_points

# The axis is labelled "Residual", so map the `residual` column rather than
# the fitted values (`RECOMMENDATION_hat`).
ggplot(model_points, aes(x = residual)) +
  geom_histogram(bins = 20) +
  labs(x = "Residual", y = "Count")

# Linear model on the CoNLL training set ----
# Same formula as the other linear fits, estimated on `train_conll`. This
# overwrites any previously fitted `RECOMMENDATION_model`.
RECOMMENDATION_model <- lm(
  RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS +
    MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS + ORIGINALITY +
    CLARITY + REVIEWER_CONFIDENCE,
  data = train_conll
)

# Print the call and the fitted coefficients.
RECOMMENDATION_model

Call:
lm(formula = RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS + 
    MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS + ORIGINALITY + 
    CLARITY + REVIEWER_CONFIDENCE, data = train_conll)

Coefficients:
          (Intercept)                 IMPACT              SUBSTANCE  
             -2.03045                0.04700                0.16525  
      APPROPRIATENESS  MEANINGFUL_COMPARISON  SOUNDNESS_CORRECTNESS  
              0.16999                0.34972                0.45175  
          ORIGINALITY                CLARITY    REVIEWER_CONFIDENCE  
              0.21587               -0.01760                0.02112  
# Residual histogram for the CoNLL model ----
# Compute the per-observation fitted values and residuals once, and keep both
# historical names pointing at the same table.
model_points <- get_regression_points(RECOMMENDATION_model)
RECOMMENDATION_model_points <- model_points

# The axis is labelled "Residual", so map the `residual` column rather than
# the fitted values (`RECOMMENDATION_hat`).
ggplot(model_points, aes(x = residual)) +
  geom_histogram(bins = 20) +
  labs(x = "Residual", y = "Count")


# Random Forest ----
# Preprocessing recipe, model specification, workflow and CV folds for a
# random-forest classifier of RECOMMENDATION.

library(ranger)

# The forest is fitted in classification mode, which needs a factor outcome.
# RECOMMENDATION is used as a numeric response by the linear models, so
# convert a copy here and leave `train` itself untouched for them.
train_rf <- train %>%
  mutate(RECOMMENDATION = factor(RECOMMENDATION))

train_rec <-
  recipe(
    RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS +
      MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS + ORIGINALITY +
      CLARITY + REVIEWER_CONFIDENCE,
    data = train_rf
  ) %>%
  # Drop incomplete rows when the recipe is trained; skipped at predict time.
  step_naomit(everything(), skip = TRUE) %>%
  step_novel(all_nominal(), -all_outcomes()) %>%
  # Only columns named in the formula exist inside the recipe. Excluding
  # columns that are not part of it (e.g. longitude / latitude) makes the
  # selection fail, so the step simply normalises every numeric predictor.
  step_normalize(all_numeric(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_corr(all_predictors(), threshold = 0.7, method = "spearman")

# Random forest via ranger; impurity importance is stored with the fit.
rf_spec <-
  rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

# Bundle preprocessing and model so they are always fitted together.
rf_wflow <-
  workflow() %>%
  add_recipe(train_rec) %>%
  add_model(rf_spec)

# 10-fold cross-validation, stratified on the (factor) outcome.
cv_folds <-
  vfold_cv(
    train_rf,
    v = 10,
    strata = RECOMMENDATION
  )
cv_folds
#  10-fold cross-validation using stratification 
# Resample the random-forest workflow ----
# Classification metrics computed on each fold's assessment set.
resample_metrics <- metric_set(
  recall, precision, f_meas,
  accuracy, kap,
  roc_auc, sens, spec
)

# Keep the out-of-fold predictions alongside the metrics.
resample_control <- control_resamples(save_pred = TRUE)

# Fit `rf_wflow` on every fold in `cv_folds` and collect the results.
log_res <- fit_resamples(
  rf_wflow,
  resamples = cv_folds,
  metrics = resample_metrics,
  control = resample_control
)
x Fold01: preprocessor 1/1: Error in `chr_as_location...
x Fold02: preprocessor 1/1: Error in `chr_as_location...
x Fold03: preprocessor 1/1: Error in `chr_as_location...
x Fold04: preprocessor 1/1: Error in `chr_as_location...
x Fold05: preprocessor 1/1: Error in `chr_as_location...
x Fold06: preprocessor 1/1: Error in `chr_as_location...
x Fold07: preprocessor 1/1: Error in `chr_as_location...
x Fold08: preprocessor 1/1: Error in `chr_as_location...
x Fold09: preprocessor 1/1: Error in `chr_as_location...
x Fold10: preprocessor 1/1: Error in `chr_as_location...
Warning: All models failed. See the `.notes` column.
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCi8vU2VjdGlvbiAxIC0gSW50cm9kdWN0aW9uOiBJbnRyb2R1Y2UgeW91ciBnZW5lcmFsIGFuZCBzcGVjaWZpYyByZXNlYXJjaCBxdWVzdGlvbihzKS4gV2hhdCBpcyB0aGUgZ2VuZXJhbCBwcm9ibGVtIGFyZWEgdGhhdCB0aGlzIGFuYWx5c2lzIGNvbnRyaWJ1dGVzIHRvPyBXaGF0IHNwZWNpZmljIHByb2JsZW0gYXJlIHlvdSB0cnlpbmcgdG8gc29sdmU/IFdoeSBpcyB0aGlzIGltcG9ydGFudD8gV2h5IGlzIHRoaXMgaGFyZD8gV2hhdCBkb2VzIHRoZW9yeSAvIHByaW9yIHdvcmsgdGVsbCB1cyBhYm91dCB0aGlzIHByb2JsZW0gYW5kIGhvdyBhcmUgeW91IGV4dGVuZGluZyBpdD8gV2hhdCBpcyB5b3VyIGFwcHJvYWNoPwoKCi8vU2VjdGlvbiAyIC0gRGF0YSBvdmVydmlldzogRGVzY3JpYmUgeW91ciBkYXRhIGF0IGEgaGlnaCBsZXZlbCwgYW5zd2VyaW5nIHF1ZXN0aW9ucyBzdWNoIGFzIHdoYXQgYXJlIHRoZSBlbnRpdGllcyBpbiB0aGUgZGF0YSwgaG93IG1hbnkgZW50aXRpZXMgYXJlIHRoZXJlLCBhbmQgd2hhdCBhcmUgdGhlIGZlYXR1cmVzIG9yIGZlYXR1cmUgZmFtaWxpZXMgcmVsZXZhbnQgdG8gdGhlIHByb2JsZW0geW91J3JlIHRhY2tsaW5nPwoKCmBgYHtyIGxvYWRfcGFja2FnZXN9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KHRpZHltb2RlbHMpCmxpYnJhcnkodGlkeXRleHQpCmxpYnJhcnkodGV4dHJlY2lwZXMpCmxpYnJhcnkoaGVyZSkKbGlicmFyeShtb2Rlcm5kaXZlKQpsaWJyYXJ5KHJhbmdlcikKCmBgYAoKCmBgYHtyIHNldF9zdHlsZX0KdGhlbWVfc2V0KHRoZW1lX21pbmltYWwoKSkKc2NvdGJsdWUgPC0gIiMwMDY1QkYiCnVrcmVkIDwtICIjRDAwQzI3IgpgYGAKCmBgYHtyIGxvYWRfZGF0YX0KZGF0YSA8LSByZWFkX2NzdignRGF0YS9EYXRhLmNzdicsIHNob3dfY29sX3R5cGVzID0gRkFMU0UpCmBgYApgYGB7ciBzcGxpdCBjb25sbCBhbmQgYWNsfQpkYXRhX2FjbCA8LSBkYXRhICU+JSBmaWx0ZXIoY29uZmVyZW5jZSA9PSAnYWNsXzIwMTcnKQpkYXRhX2NvbmxsIDwtIGRhdGEgJT4lIGZpbHRlcihjb25mZXJlbmNlID09ICdjb25sbF8yMDE2JykKYGBgCgoKYGBge3Igc3BsaXRfYWxsfQpzZXQuc2VlZCgxMjM0KQpkYXRhX3NwbG90IDwtIGluaXRpYWxfc3BsaXQoZGF0YSwgc3RyYXRhID0gUkVDT01NRU5EQVRJT04pCnRyYWluX2FsbCA8LSB0cmFpbmluZyhkYXRhX3NwbG90KQp0ZXN0X2FsbCAgPC0gdGVzdGluZyhkYXRhX3NwbG90KQpgYGAKCgpgYGB7ciBzZXRfbGluZWFyX21vZGVsfQpSRUNPTU1FTkRBVElPTl9tb2RlbCA8LSBsbShSRUNPTU1FTkRBVElPTiB+IElNUEFDVCtTVUJTVEFOQ0UrQVBQUk9QUklBVEVORVNTK01FQU5JTkdGVUxfQ09NUEFSSVNPTitTT1VORE5FU1NfQ09SUkVDVE5FU1MrCk9SSUdJTkFMSVRZK0NMQVJJVFkrUkVWSUVXRVJfQ09ORklERU5DRSxkYXRhID0gdHJhaW4pCgpgYGAKCgpgYGB7cn0K
bW9kZWxfcG9pbnRzIDwtIGdldF9yZWdyZXNzaW9uX3BvaW50cyhSRUNPTU1FTkRBVElPTl9tb2RlbCkKZ2dwbG90KHNjb3JlX21vZGVsX3BvaW50cywgYWVzKHggPSBSRUNPTU1FTkRBVElPTl9oYXQpKSArCiAgZ2VvbV9oaXN0b2dyYW0oYmlucyA9IDIwKSArCiAgbGFicyh4ID0gIlJlc2lkdWFsIiwgeSA9ICJDb3VudCIpCmBgYAoKCmBgYHtyfQpnZ3Bsb3QobW9kZWxfcG9pbnRzLCBhZXMoeCA9IFNVQlNUQU5DRSwgeSA9IHJlc2lkdWFsKSkgKwogIGdlb21fcG9pbnQoKSArCiAgbGFicyh4ID0gIlNVQlNUQU5DRSIsIHkgPSAiUkVDT01NRU5EQVRJT05faGF0IikKYGBgCgpgYGB7ciBzcGxpdF9hY2x9CnNldC5zZWVkKDEyMzQpCmRhdGFfc3BsaXQgPC0gaW5pdGlhbF9zcGxpdChkYXRhX2FjbCwgc3RyYXRhID0gUkVDT01NRU5EQVRJT04pCnRyYWluX2FjbCA8LSB0cmFpbmluZyhkYXRhX3NwbGl0KQp0ZXN0X2FjbCAgPC0gdGVzdGluZyhkYXRhX3NwbGl0KQpgYGAKCgpgYGB7ciBzcGxpdF9jb25sbH0Kc2V0LnNlZWQoMTIzNCkKZGF0YV9zcGxpdCA8LSBpbml0aWFsX3NwbGl0KGRhdGFfY29ubGwsIHN0cmF0YSA9IFJFQ09NTUVOREFUSU9OKQp0cmFpbl9jb25sbCA8LSB0cmFpbmluZyhkYXRhX3NwbGl0KQp0ZXN0X2NvbmxsICA8LSB0ZXN0aW5nKGRhdGFfc3BsaXQpCmBgYAoKCmBgYHtyIHNldF9saW5lYXJfbW9kZWxfYWNsfQpSRUNPTU1FTkRBVElPTl9tb2RlbCA8LSBsbShSRUNPTU1FTkRBVElPTiB+IElNUEFDVCtTVUJTVEFOQ0UrQVBQUk9QUklBVEVORVNTK01FQU5JTkdGVUxfQ09NUEFSSVNPTitTT1VORE5FU1NfQ09SUkVDVE5FU1MrCk9SSUdJTkFMSVRZK0NMQVJJVFkrUkVWSUVXRVJfQ09ORklERU5DRQogLGRhdGEgPSB0cmFpbl9hY2wpClJFQ09NTUVOREFUSU9OX21vZGVsCmBgYAoKCgpgYGB7ciBoaXN0b2dyYW1fYWNsfQpSRUNPTU1FTkRBVElPTl9tb2RlbF9wb2ludHMgPC0gZ2V0X3JlZ3Jlc3Npb25fcG9pbnRzKFJFQ09NTUVOREFUSU9OX21vZGVsKQoKbW9kZWxfcG9pbnRzIDwtIGdldF9yZWdyZXNzaW9uX3BvaW50cyhSRUNPTU1FTkRBVElPTl9tb2RlbCkKZ2dwbG90KG1vZGVsX3BvaW50cywgYWVzKHggPSBSRUNPTU1FTkRBVElPTl9oYXQpKSArCiAgZ2VvbV9oaXN0b2dyYW0oYmlucyA9IDIwKSArCiAgbGFicyh4ID0gIlJlc2lkdWFsIiwgeSA9ICJDb3VudCIpCmBgYAoKCmBgYHtyIH0KUkVDT01NRU5EQVRJT05fbW9kZWwgPC0gbG0oUkVDT01NRU5EQVRJT04gfiBJTVBBQ1QrU1VCU1RBTkNFK0FQUFJPUFJJQVRFTkVTUytNRUFOSU5HRlVMX0NPTVBBUklTT04rU09VTkRORVNTX0NPUlJFQ1RORVNTKwpPUklHSU5BTElUWStDTEFSSVRZK1JFVklFV0VSX0NPTkZJREVOQ0UKICxkYXRhID0gdHJhaW5fY29ubGwpClJFQ09NTUVOREFUSU9OX21vZGVsCgpgYGAKYGBge3J9ClJFQ09NTUVOREFUSU9OX21vZGVsX3BvaW50cyA8LSBnZXRfcmVncmVzc2lvbl9wb2ludHMoUkVDT01NRU5EQVRJT05fbW9kZWwpCgptb2RlbF9wb2lu
dHMgPC0gZ2V0X3JlZ3Jlc3Npb25fcG9pbnRzKFJFQ09NTUVOREFUSU9OX21vZGVsKQpnZ3Bsb3QobW9kZWxfcG9pbnRzLCBhZXMoeCA9IFJFQ09NTUVOREFUSU9OX2hhdCkpICsKICBnZW9tX2hpc3RvZ3JhbShiaW5zID0gMjApICsKICBsYWJzKHggPSAiUmVzaWR1YWwiLCB5ID0gIkNvdW50IikKYGBgCgoKCgpgYGB7cn0KCnRyYWluX3JlYyA8LQogIHJlY2lwZShSRUNPTU1FTkRBVElPTiB+IElNUEFDVCtTVUJTVEFOQ0UrQVBQUk9QUklBVEVORVNTK01FQU5JTkdGVUxfQ09NUEFSSVNPTitTT1VORE5FU1NfQ09SUkVDVE5FU1MrCk9SSUdJTkFMSVRZK0NMQVJJVFkrUkVWSUVXRVJfQ09ORklERU5DRQogLGRhdGEgPSB0cmFpbikgJT4lCiAgc3RlcF9uYW9taXQoZXZlcnl0aGluZygpLCBza2lwID0gVFJVRSkgJT4lIAogIHN0ZXBfbm92ZWwoYWxsX25vbWluYWwoKSwgLWFsbF9vdXRjb21lcygpKSAlPiUKICBzdGVwX25vcm1hbGl6ZShhbGxfbnVtZXJpYygpLCAtYWxsX291dGNvbWVzKCksIAogICAgICAgICAgICAgICAgIC1sb25naXR1ZGUsIC1sYXRpdHVkZSkgJT4lIAogIHN0ZXBfZHVtbXkoYWxsX25vbWluYWwoKSwgLWFsbF9vdXRjb21lcygpKSAlPiUKICBzdGVwX3p2KGFsbF9udW1lcmljKCksIC1hbGxfb3V0Y29tZXMoKSkgJT4lCiAgc3RlcF9jb3JyKGFsbF9wcmVkaWN0b3JzKCksIHRocmVzaG9sZCA9IDAuNywgbWV0aG9kID0gInNwZWFybWFuIikgCmBgYAoKLy9SYW5kb20gRm9yZXN0IApgYGB7cn0KCnJmX3NwZWMgPC0gCiAgcmFuZF9mb3Jlc3QoKSAlPiUgCiAgc2V0X2VuZ2luZSgicmFuZ2VyIiwgaW1wb3J0YW5jZSA9ICJpbXB1cml0eSIpICU+JSAKICBzZXRfbW9kZSgiY2xhc3NpZmljYXRpb24iKQoKcmZfd2Zsb3cgPC0KIHdvcmtmbG93KCkgJT4lCiBhZGRfcmVjaXBlKHRyYWluX3JlYykgJT4lIAogYWRkX21vZGVsKHJmX3NwZWMpIAoKYGBgCgpgYGB7cn0KY3ZfZm9sZHMgPC0KIHZmb2xkX2N2KHRyYWluLCAKICAgICAgICAgIHYgPSAxMCwgCiAgICAgICAgICBzdHJhdGEgPSBSRUNPTU1FTkRBVElPTikgCmN2X2ZvbGRzCmBgYAoKYGBge3J9CmxvZ19yZXMgPC0gCiAgcmZfd2Zsb3cgJT4lIAogIGZpdF9yZXNhbXBsZXMoCiAgICByZXNhbXBsZXMgPSBjdl9mb2xkcywgCiAgICBtZXRyaWNzID0gbWV0cmljX3NldCgKICAgICAgcmVjYWxsLCBwcmVjaXNpb24sIGZfbWVhcywgCiAgICAgIGFjY3VyYWN5LCBrYXAsCiAgICAgIHJvY19hdWMsIHNlbnMsIHNwZWMpLAogICAgY29udHJvbCA9IGNvbnRyb2xfcmVzYW1wbGVzKAogICAgICBzYXZlX3ByZWQgPSBUUlVFKQogICAgKSAKCmBgYAoKCgoK